# PSRM Word2vec Dictionary Creation Nov 2025
# Run this script in a fresh R session (e.g. with `Rscript`) rather than
# clearing the workspace with `rm(list = ls())`: that call only deletes objects
# in the global environment, leaves attached packages and options untouched,
# and destroys the workspace of anyone who sources this file interactively.
getwd()        # working directory that the relative file paths below resolve against
sessionInfo()  # R and package versions, for the replication record
library(dplyr)
library("quanteda")
library('glmnet')
library("word2vec")

# Loading the Data --------------------------------------------------------

## The data are not publicly available due to copyright restrictions.
### For the original replication process, the data are at https://drive.google.com/file/u/1/d/14KN84eY-G6zefTiCdQhu9kDWPmfPbT2i/view?usp=sharing

# Read each leader's corpus file into a character vector (one element per line
# of the file). The paths are relative to the working directory.
# NOTE(review): the Mao file name uses "token" (singular) while the other four
# use "tokens" -- confirm this matches the file on disk.
xi_tokens_lines    <- readLines(con = "xi_tokens_nov_2024.txt")
hu_tokens_lines    <- readLines(con = "hu_tokens_nov_2024.txt")
jiang_tokens_lines <- readLines(con = "jiang_tokens_nov_2024.txt")
mao_tokens_lines   <- readLines(con = "mao_token_nov_2024.txt")
deng_tokens_lines  <- readLines(con = "deng_tokens_nov_2024.txt")

# Apply word2vec to each leadership corpus --------------------------------

# Train on each leader's text file with 100 to 300 dimensions -------------


# Xi word2vec -------------------------------------------------------------
set.seed(123)

# Train one skip-gram model per embedding dimension on the Xi corpus. The list
# names ("100", "200", ...) are the dimensions and are reused below for file
# names and prediction keys.
# (The assignment target was previously split across two lines, so
# `xi_word2vec_models` was never created.)
# NOTE(review): training runs with threads = 4; confirm that results are
# reproducible across runs with multi-threaded training.
xi_word2vec_models <- list(
  "100" = word2vec(x = xi_tokens_lines, type = "skip-gram", dim = 100, iter = 15, window = 5, min_count = 5, threads = 4),
  "200" = word2vec(x = xi_tokens_lines, type = "skip-gram", dim = 200, iter = 15, window = 5, min_count = 5, threads = 4),
  "250" = word2vec(x = xi_tokens_lines, type = "skip-gram", dim = 250, iter = 15, window = 5, min_count = 5, threads = 4),
  "300" = word2vec(x = xi_tokens_lines, type = "skip-gram", dim = 300, iter = 15, window = 5, min_count = 5, threads = 4)
)

# Save models for replication: one "xi_word2vec_model_<dim>.bin" per model,
# written to the working directory.
for (dim in names(xi_word2vec_models)) {
  path <- paste0("xi_word2vec_model_", dim, ".bin")
  write.word2vec(xi_word2vec_models[[dim]], file = path)
}

# Convert models to embedding matrices (one matrix per dimension)
xi_matrices <- lapply(xi_word2vec_models, as.matrix)

# Define seed words for implicit threat predictions
seed_words <- c("一切后果", "后果", "严重后果")

# Query the 100 nearest terms to every seed word in every model. Results are
# stored under keys of the form "dim_<dimension>_<seed word>", which the
# reshaping step later splits on "_".
predictions_list <- list()
for (dim in names(xi_word2vec_models)) {
  for (word in seed_words) {
    key <- paste0("dim_", dim, "_", word)
    predictions_list[[key]] <- predict(xi_word2vec_models[[dim]], word, type = "nearest", top_n = 100)
  }
}

# Reshape one nearest-neighbour prediction into a long data frame.
#
# Arguments:
#   predictions  Result of predict(model, word, type = "nearest") for a single
#                seed word: a data frame, or a one-element list holding one.
#   term_name    Seed word that was queried; stored in column `frame`.
#   dimension    Embedding dimension of the model; stored in column
#                `word2vec_dimension`.
#   leader       Label stored in column `leader`. Defaults to "Xi" so that
#                existing three-argument calls behave as before.
#
# Returns a data frame with the prediction columns first, followed by `frame`,
# `leader` and `word2vec_dimension`, in that order. Callers in this script
# rename the columns by position, so the order must not change.
reshape_to_long <- function(predictions, term_name, dimension, leader = "Xi") {
  df <- as.data.frame(predictions)

  # The callers' seven-name rename implies four prediction columns: query term,
  # neighbouring term, similarity, rank.
  # NOTE(review): a three-column layout (no query-term column) is assumed for
  # older word2vec releases -- confirm against the installed version.
  prediction_cols <- c("seed_word", "predicted_word", "cos_similarity", "rank")
  if (ncol(df) == 3L) {
    prediction_cols <- prediction_cols[-1L]
  }
  if (ncol(df) != length(prediction_cols)) {
    stop("Expected 3 or 4 prediction columns, got ", ncol(df), call. = FALSE)
  }
  colnames(df) <- prediction_cols

  # rep() keeps the assignments valid when a query returns zero rows.
  n_rows <- nrow(df)
  df$frame <- rep(term_name, n_rows)
  df$leader <- rep(leader, n_rows)
  df$word2vec_dimension <- rep(dimension, n_rows)
  df
}

# Stack every Xi prediction into one long data frame. Each list name has the
# form "dim_<dimension>_<seed word>", so splitting on "_" recovers the
# dimension (second piece) and the seed word (everything after it).
xi_long_df <- local({
  per_query <- lapply(names(predictions_list), function(key) {
    pieces <- strsplit(key, "_")[[1]]
    seed_term <- paste(pieces[3:length(pieces)], collapse = "_")
    reshape_to_long(predictions_list[[key]], seed_term, pieces[2])
  })
  do.call(rbind, per_query)
})

# Final column labels, assigned by position
names(xi_long_df) <- c("seed words","predicted words", "cos similarity", "rank", "frame", "leader", "word2vec_dimension")


# Hu word2vec -------------------------------------------------------------
set.seed(123)

# One skip-gram model per embedding dimension on the Hu corpus; setNames(nm = )
# names each list element after its dimension ("100", "200", "250", "300").
hu_word2vec_models <- lapply(
  setNames(nm = c(100, 200, 250, 300)),
  function(n_dim) {
    word2vec(x = hu_tokens_lines, type = "skip-gram", dim = n_dim, iter = 15, window = 5, min_count = 5, threads = 4)
  }
)

# Write each model to "hu_word2vec_model_<dim>.bin" for replication
for (size_label in names(hu_word2vec_models)) {
  write.word2vec(
    hu_word2vec_models[[size_label]],
    file = paste0("hu_word2vec_model_", size_label, ".bin")
  )
}

# Embedding matrices, one per model
hu_matrices <- lapply(hu_word2vec_models, as.matrix)

# Seed words for implicit threat predictions
seed_words <- c("一切后果", "后果", "严重后果")

# 100 nearest terms for every (model, seed word) pair, stored under
# "dim_<dimension>_<seed word>"
predictions_list <- list()
for (size_label in names(hu_word2vec_models)) {
  current_model <- hu_word2vec_models[[size_label]]
  for (seed in seed_words) {
    predictions_list[[paste0("dim_", size_label, "_", seed)]] <-
      predict(current_model, seed, type = "nearest", top_n = 100)
  }
}



# Convert predictions to a single dataframe. Keys look like
# "dim_<dimension>_<seed word>", so the second piece is the dimension and the
# remainder is the seed word.
hu_long_df <- do.call(rbind, lapply(names(predictions_list), function(name) {
  parts <- strsplit(name, "_")[[1]]
  dim_val <- parts[2]  # Extract dimension
  seed_term <- paste(parts[3:length(parts)], collapse = "_")  # Extract seed word
  reshape_to_long(predictions_list[[name]], seed_term, dim_val)
}))

# Rename columns (by position)
colnames(hu_long_df) <- c("seed words","predicted words", "cos similarity", "rank", "frame", "leader", "word2vec_dimension")

# reshape_to_long() labels rows "Xi" when called with three arguments; relabel
# so the `leader` column identifies the corpus these predictions came from.
hu_long_df$leader <- rep("Hu", nrow(hu_long_df))


# jiang word2vec ----------------------------------------------------------
# Seed the RNG before training, as every other leader section does.
set.seed(123)

# Train one skip-gram model per embedding dimension on the Jiang corpus; list
# names are the dimensions and are reused for file names and prediction keys.
jiang_word2vec_models <- list(
  "100" = word2vec(x = jiang_tokens_lines, type = "skip-gram", dim = 100, iter = 15, window = 5, min_count = 5, threads = 4),
  "200" = word2vec(x = jiang_tokens_lines, type = "skip-gram", dim = 200, iter = 15, window = 5, min_count = 5, threads = 4),
  "250" = word2vec(x = jiang_tokens_lines, type = "skip-gram", dim = 250, iter = 15, window = 5, min_count = 5, threads = 4),
  "300" = word2vec(x = jiang_tokens_lines, type = "skip-gram", dim = 300, iter = 15, window = 5, min_count = 5, threads = 4)
)

# Save models for replication: one "jiang_word2vec_model_<dim>.bin" per model
for (dim in names(jiang_word2vec_models)) {
  path <- paste0("jiang_word2vec_model_", dim, ".bin")
  write.word2vec(jiang_word2vec_models[[dim]], file = path)
}

# Convert models to matrices
jiang_matrices <- lapply(jiang_word2vec_models, as.matrix)

# Define seed words for implicit threat predictions
seed_words <- c("一切后果", "后果", "严重后果")

# Predict the 100 nearest words for each model and seed word; keys have the
# form "dim_<dimension>_<seed word>"
predictions_list <- list()
for (dim in names(jiang_word2vec_models)) {
  for (word in seed_words) {
    key <- paste0("dim_", dim, "_", word)
    predictions_list[[key]] <- predict(jiang_word2vec_models[[dim]], word, type = "nearest", top_n = 100)
  }
}


# Convert predictions to a single dataframe. Keys look like
# "dim_<dimension>_<seed word>", so the second piece is the dimension and the
# remainder is the seed word.
jiang_long_df <- do.call(rbind, lapply(names(predictions_list), function(name) {
  parts <- strsplit(name, "_")[[1]]
  dim_val <- parts[2]  # Extract dimension
  seed_term <- paste(parts[3:length(parts)], collapse = "_")  # Extract seed word
  reshape_to_long(predictions_list[[name]], seed_term, dim_val)
}))

# Rename columns (by position)
colnames(jiang_long_df) <- c("seed words","predicted words", "cos similarity", "rank", "frame", "leader", "word2vec_dimension")

# reshape_to_long() labels rows "Xi" when called with three arguments; relabel
# so the `leader` column identifies the corpus these predictions came from.
jiang_long_df$leader <- rep("Jiang", nrow(jiang_long_df))

# deng word2vec -----------------------------------------------------------
set.seed(123)

# One skip-gram model per embedding dimension on the Deng corpus; setNames(nm = )
# names each list element after its dimension ("100", "200", "250", "300").
deng_word2vec_models <- lapply(
  setNames(nm = c(100, 200, 250, 300)),
  function(n_dim) {
    word2vec(x = deng_tokens_lines, type = "skip-gram", dim = n_dim, iter = 15, window = 5, min_count = 5, threads = 4)
  }
)

# Write each model to "deng_word2vec_model_<dim>.bin" for replication
for (size_label in names(deng_word2vec_models)) {
  write.word2vec(
    deng_word2vec_models[[size_label]],
    file = paste0("deng_word2vec_model_", size_label, ".bin")
  )
}

# Embedding matrices, one per model
deng_matrices <- lapply(deng_word2vec_models, as.matrix)

# Seed words for implicit threat predictions
seed_words <- c("一切后果", "后果", "严重后果")

# 100 nearest terms for every (model, seed word) pair, stored under
# "dim_<dimension>_<seed word>"
predictions_list <- list()
for (size_label in names(deng_word2vec_models)) {
  current_model <- deng_word2vec_models[[size_label]]
  for (seed in seed_words) {
    predictions_list[[paste0("dim_", size_label, "_", seed)]] <-
      predict(current_model, seed, type = "nearest", top_n = 100)
  }
}



# Convert predictions to a single dataframe. Keys look like
# "dim_<dimension>_<seed word>", so the second piece is the dimension and the
# remainder is the seed word.
deng_long_df <- do.call(rbind, lapply(names(predictions_list), function(name) {
  parts <- strsplit(name, "_")[[1]]
  dim_val <- parts[2]  # Extract dimension
  seed_term <- paste(parts[3:length(parts)], collapse = "_")  # Extract seed word
  reshape_to_long(predictions_list[[name]], seed_term, dim_val)
}))

# Rename columns (by position)
colnames(deng_long_df) <- c("seed words","predicted words", "cos similarity", "rank", "frame", "leader", "word2vec_dimension")

# reshape_to_long() labels rows "Xi" when called with three arguments; relabel
# so the `leader` column identifies the corpus these predictions came from.
deng_long_df$leader <- rep("Deng", nrow(deng_long_df))


# mao word2vec ------------------------------------------------------------
set.seed(123)

# One skip-gram model per embedding dimension on the Mao corpus; setNames(nm = )
# names each list element after its dimension ("100", "200", "250", "300").
mao_word2vec_models <- lapply(
  setNames(nm = c(100, 200, 250, 300)),
  function(n_dim) {
    word2vec(x = mao_tokens_lines, type = "skip-gram", dim = n_dim, iter = 15, window = 5, min_count = 5, threads = 4)
  }
)

# Write each model to "mao_word2vec_model_<dim>.bin" for replication
for (size_label in names(mao_word2vec_models)) {
  write.word2vec(
    mao_word2vec_models[[size_label]],
    file = paste0("mao_word2vec_model_", size_label, ".bin")
  )
}

# Embedding matrices, one per model
mao_matrices <- lapply(mao_word2vec_models, as.matrix)

# Seed words for implicit threat predictions
seed_words <- c("一切后果", "后果", "严重后果")

# 100 nearest terms for every (model, seed word) pair, stored under
# "dim_<dimension>_<seed word>"
predictions_list <- list()
for (size_label in names(mao_word2vec_models)) {
  current_model <- mao_word2vec_models[[size_label]]
  for (seed in seed_words) {
    predictions_list[[paste0("dim_", size_label, "_", seed)]] <-
      predict(current_model, seed, type = "nearest", top_n = 100)
  }
}


# Convert predictions to a single dataframe. Keys look like
# "dim_<dimension>_<seed word>", so the second piece is the dimension and the
# remainder is the seed word.
mao_long_df <- do.call(rbind, lapply(names(predictions_list), function(name) {
  parts <- strsplit(name, "_")[[1]]
  dim_val <- parts[2]  # Extract dimension
  seed_term <- paste(parts[3:length(parts)], collapse = "_")  # Extract seed word
  reshape_to_long(predictions_list[[name]], seed_term, dim_val)
}))

# Rename columns (by position)
colnames(mao_long_df) <- c("seed words","predicted words", "cos similarity", "rank", "frame", "leader", "word2vec_dimension")

# reshape_to_long() labels rows "Xi" when called with three arguments; relabel
# so the `leader` column identifies the corpus these predictions came from.
mao_long_df$leader <- rep("Mao", nrow(mao_long_df))


# Data used to generate the Chinese-language lexicon ----------------------
# Stack the per-leader tables in the order Mao, Jiang, Xi, Hu, Deng.
# Expected size: 5 leaders x 4 dimensions x 3 seed words x 100 neighbours
# = 6000 rows, provided every query returns 100 terms.
all_pd_foreign_lexi <- do.call(
  rbind,
  list(mao_long_df, jiang_long_df, xi_long_df, hu_long_df, deng_long_df)
)

# Keep only the first occurrence of each predicted word (so the stacking order
# above decides which leader/dimension row survives) and the four columns
# needed for the lexicon.
all_pd_foreign_lexi_non_duplicated <- all_pd_foreign_lexi[
  !duplicated(all_pd_foreign_lexi[["predicted words"]]),
  c("predicted words", "cos similarity", "leader", "word2vec_dimension")
]

# File for dictionary creation --------------------------------------------

# Written to the working directory for replication; row names are included
# because write.csv() writes them by default.
write.csv(x = all_pd_foreign_lexi_non_duplicated, file = "all_pd_foreign_lexi_all_wave_march.csv")
###########